In this tutorial we'll demonstrate Coach's hierarchical RL support by building a new agent that implements the Hierarchical Actor Critic (HAC) algorithm (https://arxiv.org/pdf/1712.00948.pdf), along with a preset that runs the agent on a goal-based pendulum environment in MuJoCo.
First, some imports. Note that HAC builds on DDPG, so we import the relevant DDPG classes.
In [ ]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    sys.path.append(module_path + '/rl_coach')
from typing import Union
import numpy as np
from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.core_types import RunPhase
Now let's define the HAC algorithm and agent parameters.
See tutorial 1 for more details on the content of each of these classes.
In [ ]:
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
    def __init__(self):
        super().__init__()
        self.sub_goal_testing_rate = 0.5  # fraction of generated sub-goals that are tested
        self.time_limit = 40              # a missed tested sub-goal is penalized with -time_limit


class HACDDPGAgentParameters(DDPGAgentParameters):
    def __init__(self):
        super().__init__()
        # use the HAC algorithm parameters defined above (rather than the plain DDPG ones),
        # so that sub_goal_testing_rate and time_limit are available to the agent
        self.algorithm = HACDDPGAlgorithmParameters()
Now we'll define the agent itself - HACDDPGAgent - which subclasses the DDPG agent class. The main difference between the DDPG agent and the HACDDPGAgent is the handling of the sub-goal that a higher-level agent sets for the lower-level agent, and the testing of the sub-goals each level generates, hence the overridden DDPG agent functions. A short illustration of the sub-goal testing penalty follows the class.
In [ ]:
class HACDDPGAgent(DDPGAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate
        self.graph_manager = None
    def choose_action(self, curr_state):
        # the top level agent decides, for each sub-goal it generates, whether all the levels
        # beneath it should treat it as a sub-goal testing phase
        graph_manager = self.parent_level_manager.parent_graph_manager
        if self.ap.is_a_highest_level_agent:
            graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate

        if self.phase == RunPhase.TRAIN:
            if graph_manager.should_test_current_sub_goal:
                self.exploration_policy.change_phase(RunPhase.TEST)
            else:
                self.exploration_policy.change_phase(self.phase)

        action_info = super().choose_action(curr_state)
        return action_info
    def update_transition_before_adding_to_replay_buffer(self, transition):
        graph_manager = self.parent_level_manager.parent_graph_manager

        # deal with goals given from a higher level agent
        if not self.ap.is_a_highest_level_agent:
            transition.state['desired_goal'] = self.current_hrl_goal
            transition.next_state['desired_goal'] = self.current_hrl_goal
            self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(
                self.current_hrl_goal, transition.next_state))
            goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
                self.current_hrl_goal, transition.next_state)
            transition.reward = goal_reward
            transition.game_over = transition.game_over or sub_goal_reached

        # each level tests its own generated sub goals
        if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:
            _, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
                transition.action, transition.next_state)
            sub_goal_is_missed = not sub_goal_reached
            if sub_goal_is_missed:
                transition.reward = -self.ap.algorithm.time_limit
        return transition
    def set_environment_parameters(self, spaces: SpacesDefinition):
        super().set_environment_parameters(spaces)

        if self.ap.is_a_highest_level_agent:
            # the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have
            # their GoalsSpace set to the in_action_space in agent.set_environment_parameters()
            self.spaces.goal = self.spaces.action
            self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])

        if not self.ap.is_a_highest_level_agent:
            self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward
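To make the sub-goal testing logic concrete, here is a small, purely illustrative check of the numbers involved: half of the generated sub-goals are tested (sub_goal_testing_rate = 0.5), and a tested sub-goal that the lower level fails to reach gets a reward of minus the algorithm's time limit.
In [ ]:
# Illustrative only - not part of the agent definition.
# With the default HAC parameters defined above, a sub-goal that the lower level
# fails to reach during a testing phase is penalized with -time_limit.
illustrative_params = HACDDPGAlgorithmParameters()
print(illustrative_params.sub_goal_testing_rate)   # 0.5 - fraction of sub-goals that are tested
print(-illustrative_params.time_limit)             # -40 - reward assigned to a missed tested sub-goal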
Next we define the top agent in the hierarchy. Note that the agent's base parameters are the same as the DDPG agent's parameters; we also define its memory (hindsight experience replay), exploration policy and network topology here. A small sanity check of the goal-reaching threshold follows the cell.
In [ ]:
from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.base_parameters import VisualizationParameters, EmbeddingMergerType, EmbedderScheme
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \
    EpisodicHindsightExperienceReplayParameters
from rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay import \
    EpisodicHRLHindsightExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.spaces import GoalsSpace, ReachingGoal
from rl_coach.exploration_policies.ou_process import OUProcessParameters
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase, TrainingSteps
time_limit = 1000
polar_coordinates = False
distance_from_goal_threshold = np.array([0.075, 0.075, 0.75])
goals_space = GoalsSpace('achieved_goal',
                         ReachingGoal(default_reward=-1, goal_reaching_reward=0,
                                      distance_from_goal_threshold=distance_from_goal_threshold),
                         lambda goal, state: np.abs(goal - state))  # raw L1 distance
top_agent_params = HACDDPGAgentParameters()
# memory - Hindsight Experience Replay
top_agent_params.memory = EpisodicHRLHindsightExperienceReplayParameters()
top_agent_params.memory.max_size = (MemoryGranularity.Transitions, 10000000)
top_agent_params.memory.hindsight_transitions_per_regular_transition = 3
top_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
top_agent_params.memory.goals_space = goals_space
top_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(32)
top_agent_params.algorithm.num_consecutive_training_steps = 40
top_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
# exploration - OU process
top_agent_params.exploration = OUProcessParameters()
top_agent_params.exploration.theta = 0.1
# actor - note that the default middleware is overridden with 3 dense layers
top_actor = top_agent_params.network_wrappers['actor']
top_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                        'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
top_actor.middleware_parameters.scheme = [Dense([64])] * 3
top_actor.learning_rate = 0.001
top_actor.batch_size = 4096
# critic - note that the default middleware is overridden with 3 dense layers
top_critic = top_agent_params.network_wrappers['critic']
top_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                         'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                         'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
top_critic.embedding_merger_type = EmbeddingMergerType.Concat
top_critic.middleware_parameters.scheme = [Dense([64])] * 3
top_critic.learning_rate = 0.001
top_critic.batch_size = 4096
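Before moving on, here is a quick, hand-rolled sanity check of the goals space we just defined. It only mimics the intended logic - a goal counts as reached when the per-dimension distance returned by our lambda is within distance_from_goal_threshold - and does not call the GoalsSpace API, so treat it as a sketch rather than the library's actual check.
In [ ]:
# A hand-rolled sketch of the goal-reaching test, for intuition only
# (assumed logic, not a call into GoalsSpace / ReachingGoal).
example_goal = np.array([0.0, 1.0, 0.0])
example_achieved = np.array([0.05, 0.95, 0.3])
per_dim_distance = np.abs(example_goal - example_achieved)          # the distance metric we passed in
print(per_dim_distance)
print(np.all(per_dim_distance <= distance_from_goal_threshold))     # True -> within the threshold on every dimension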
The bottom agent is defined similarly. Its input action space is the goals space (it receives sub-goals from the top agent), it uses regular hindsight experience replay, and its exploration policy is epsilon-greedy wrapped around a continuous OU process.
In [ ]:
from rl_coach.schedules import ConstantSchedule
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
bottom_agent_params = HACDDPGAgentParameters()
bottom_agent_params.algorithm.in_action_space = goals_space
bottom_agent_params.memory = EpisodicHindsightExperienceReplayParameters()
bottom_agent_params.memory.max_size = (MemoryGranularity.Transitions, 12000000)
bottom_agent_params.memory.hindsight_transitions_per_regular_transition = 4
bottom_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
bottom_agent_params.memory.goals_space = goals_space
bottom_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16 * 25) # 25 episodes is one true env episode
bottom_agent_params.algorithm.num_consecutive_training_steps = 40
bottom_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
bottom_agent_params.exploration = EGreedyParameters()
bottom_agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
bottom_agent_params.exploration.evaluation_epsilon = 0
bottom_agent_params.exploration.continuous_exploration_policy_parameters = OUProcessParameters()
bottom_agent_params.exploration.continuous_exploration_policy_parameters.theta = 0.1
# actor
bottom_actor = bottom_agent_params.network_wrappers['actor']
bottom_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                           'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
bottom_actor.middleware_parameters.scheme = [Dense([64])] * 3
bottom_actor.learning_rate = 0.001
bottom_actor.batch_size = 4096
# critic
bottom_critic = bottom_agent_params.network_wrappers['critic']
bottom_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                            'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
                                            'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
bottom_critic.embedding_merger_type = EmbeddingMergerType.Concat
bottom_critic.middleware_parameters.scheme = [Dense([64])] * 3
bottom_critic.learning_rate = 0.001
bottom_critic.batch_size = 4096
Now we list the parameters of all the agents in the hierarchy, ordered from top to bottom.
In [ ]:
agents_params = [top_agent_params, bottom_agent_params]
Next, define the environment, visualization and schedule parameters. The schedule parameters refer to the top-level agent.
In [ ]:
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod
from rl_coach.graph_managers.hrl_graph_manager import HRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
env_params = Mujoco()
env_params.level = "rl_coach.environments.mujoco.pendulum_with_goals:PendulumWithGoals"
env_params.additional_simulator_parameters = {"time_limit": time_limit,
                                              "random_goals_instead_of_standing_goal": False,
                                              "polar_coordinates": polar_coordinates,
                                              "goal_reaching_thresholds": distance_from_goal_threshold}
env_params.frame_skip = 10
env_params.custom_reward_threshold = -time_limit + 1
vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST)]
vis_params.dump_mp4 = False
vis_params.native_rendering = False
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(40 * 4 * 64) # 40 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(4 * 64) # 4 small batches of 64 episodes
schedule_params.evaluation_steps = EnvironmentEpisodes(64)
schedule_params.heatup_steps = EnvironmentSteps(0)
Lastly, we create an HRLGraphManager that will execute the hierarchy of agents we defined according to the parameters above.
Note that the bottom-level agent will run 40 environment steps for each single step of the top-level agent (a quick check of the resulting episode structure follows the next cell).
In [ ]:
graph_manager = HRLGraphManager(agents_params=agents_params, env_params=env_params,
                                schedule_params=schedule_params, vis_params=vis_params,
                                consecutive_steps_to_run_each_level=EnvironmentSteps(40))
graph_manager.visualization_parameters.render = True
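As a quick back-of-the-envelope check (illustrative only, not part of the preset): with a 1000-step environment episode and 40 low-level steps per high-level decision, the top agent acts 25 times per environment episode, which is exactly the "25 episodes is one true env episode" comment next to the bottom agent's num_consecutive_playing_steps above.
In [ ]:
# Illustrative arithmetic only.
low_level_steps_per_decision = 40                    # consecutive_steps_to_run_each_level above
print(time_limit // low_level_steps_per_decision)    # 25 top-level decisions per environment episode
Finally, we create the graph and run the training. The TaskParameters specify the framework to use, the experiment directory for logs and checkpoints, and whether we only want to evaluate a previously trained agent.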
In [ ]:
from rl_coach.base_parameters import TaskParameters, Frameworks
log_path = '../experiments/pendulum_hac'
if not os.path.exists(log_path):
    os.makedirs(log_path)
task_parameters = TaskParameters(framework_type=Frameworks.tensorflow,
                                 evaluate_only=False,
                                 experiment_path=log_path)
task_parameters.__dict__['checkpoint_save_secs'] = None
task_parameters.__dict__['verbosity'] = 'low'
graph_manager.create_graph(task_parameters)
graph_manager.improve()